# Clear workspace
rm(list=ls()); graphics.off()
### Load packages
library(tidyverse) # Collection of all the good stuff like dplyr, ggplot2 ect.
library(magrittr) # For extra-piping operators (eg. %<>%)
library(skimr) # For nice data summaries
The dataset comprises of three main tables:
listings - Detailed listings data showing 96 atttributes for each of the listings. Some of the attributes which are intuitivly interesting are: price (continuous), longitude (continuous), latitude (continuous), listing_type (categorical), is_superhost (categorical), neighbourhood (categorical), ratings (continuous) among others.reviews - Detailed reviews given by the guests with 6 attributes. Key attributes include date (datetime), listing_id (discrete), reviewer_id (discrete) and comment (textual).calendar - Provides details about booking for the next year by listing. Four attributes in total including listing_id (discrete), date (datetime), available (categorical) and price (continuous).listings <- read_csv('http://data.insideairbnb.com/denmark/hovedstaden/copenhagen/2020-06-26/data/listings.csv.gz')
listings %>% glimpse()
Rows: 28,523
Columns: 106
$ id <dbl> 6983, 26057, 26473, 29118, 29618, 31094, 32379, 32841, 33850, 3…
$ listing_url <chr> "https://www.airbnb.com/rooms/6983", "https://www.airbnb.com/ro…
$ scrape_id <dbl> 20200626200423, 20200626200423, 20200626200423, 20200626200423,…
$ last_scraped <date> 2020-06-28, 2020-06-28, 2020-06-28, 2020-06-28, 2020-06-29, 20…
$ name <chr> "Copenhagen 'N Livin'", "Lovely house - most attractive area", …
$ summary <chr> "Lovely apartment located in the hip Nørrebro area, close to ba…
$ space <chr> "Beautiful and cosy apartment conveniently located in the hip N…
$ description <chr> "Lovely apartment located in the hip Nørrebro area, close to ba…
$ experiences_offered <chr> "none", "none", "none", "none", "none", "none", "none", "none",…
$ neighborhood_overview <chr> "Nice bars and cozy cafes just minutes away, yet the street its…
$ notes <chr> NA, NA, NA, NA, "Please note that the bed in the second bedroom…
$ transit <chr> "Bus 66 runs to the central station. Forum metro is about 10 mi…
$ access <chr> "Bedroom, living room, kitchen, and bathroom for shared use. Yo…
$ interaction <chr> "We are usually at work during day time, but will be home most …
$ house_rules <chr> "No smoking allowed! No pets.", "We will leave the house clean …
$ thumbnail_url <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ medium_url <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ picture_url <chr> "https://a0.muscache.com/im/pictures/42044170/f63c4d99_original…
$ xl_picture_url <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ host_id <dbl> 16774, 109777, 112210, 125230, 127577, 129976, 140105, 142143, …
$ host_url <chr> "https://www.airbnb.com/users/show/16774", "https://www.airbnb.…
$ host_name <chr> "Simon", "Kari", "Oliver", "Nana", "Simon And Anna", "Ebbe", "L…
$ host_since <date> 2009-05-12, 2010-04-17, 2010-04-22, 2010-05-15, 2010-05-18, 20…
$ host_location <chr> "Copenhagen, Capital Region of Denmark, Denmark", "Copenhagen, …
$ host_about <chr> "I'm currently working as an environmental consultant for a lar…
$ host_response_time <chr> "N/A", "N/A", "within a few hours", "N/A", "N/A", "N/A", "withi…
$ host_response_rate <chr> "N/A", "N/A", "100%", "N/A", "N/A", "N/A", "100%", "N/A", "N/A"…
$ host_acceptance_rate <chr> "33%", "19%", "100%", "17%", "N/A", "N/A", "97%", "0%", "N/A", …
$ host_is_superhost <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, …
$ host_thumbnail_url <chr> "https://a0.muscache.com/im/users/16774/profile_pic/1401276934/…
$ host_picture_url <chr> "https://a0.muscache.com/im/users/16774/profile_pic/1401276934/…
$ host_neighbourhood <chr> "Nørrebro", "Indre By", "Indre By", "Vesterbro", "Østerbro", "V…
$ host_listings_count <dbl> 1, 1, 4, 1, 1, 1, 3, 1, 0, 2, 1, 1, 2, 1, 1, 1, 1, 1, 0, 1, 1, …
$ host_total_listings_count <dbl> 1, 1, 4, 1, 1, 1, 3, 1, 0, 2, 1, 1, 2, 1, 1, 1, 1, 1, 0, 1, 1, …
$ host_verifications <chr> "['email', 'phone', 'reviews']", "['email', 'phone', 'reviews',…
$ host_has_profile_pic <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRU…
$ host_identity_verified <lgl> FALSE, FALSE, TRUE, FALSE, TRUE, FALSE, FALSE, FALSE, TRUE, TRU…
$ street <chr> "Copenhagen, Hovedstaden, Denmark", "Copenhagen, Hovedstaden, D…
$ neighbourhood <chr> "Nørrebro", "Indre By", "Indre By", "Vesterbro", "Østerbro", "V…
$ neighbourhood_cleansed <chr> "Nrrebro", "Indre By", "Indre By", "Vesterbro-Kongens Enghave",…
$ neighbourhood_group_cleansed <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ city <chr> "Copenhagen", "Copenhagen", "Copenhagen", "Copenhagen", "Copenh…
$ state <chr> "Hovedstaden", "Hovedstaden", "Hovedstaden", "Hovedstaden", "Ho…
$ zipcode <chr> "2200", "2100", "1210", "1650", "2100", "1719", "1620", "2100",…
$ market <chr> "Copenhagen", "Copenhagen", "Copenhagen", "Copenhagen", "Copenh…
$ smart_location <chr> "Copenhagen, Denmark", "Copenhagen, Denmark", "Copenhagen, Denm…
$ country_code <chr> "DK", "DK", "DK", "DK", "DK", "DK", "DK", "DK", "DK", "DK", "DK…
$ country <chr> "Denmark", "Denmark", "Denmark", "Denmark", "Denmark", "Denmark…
$ latitude <dbl> 55.68798, 55.69163, 55.67590, 55.67069, 55.69375, 55.66744, 55.…
$ longitude <dbl> 12.54571, 12.57459, 12.57698, 12.55430, 12.56945, 12.55516, 12.…
$ is_location_exact <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, TRUE, TR…
$ property_type <chr> "Apartment", "House", "House", "Apartment", "Apartment", "Apart…
$ room_type <chr> "Private room", "Entire home/apt", "Entire home/apt", "Entire h…
$ accommodates <dbl> 2, 6, 12, 2, 4, 3, 3, 4, 5, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 4, 6,…
$ bathrooms <dbl> 1.0, 1.5, 2.5, 1.0, 1.0, 1.0, 2.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0…
$ bedrooms <dbl> 1, 4, 6, 1, 3, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, …
$ beds <dbl> 1, 4, 7, 1, 3, 3, 2, 2, 1, 1, 0, 1, 1, 1, 1, 1, 1, 2, 1, 2, 4, …
$ bed_type <chr> "Real Bed", "Real Bed", "Real Bed", "Real Bed", "Real Bed", "Re…
$ amenities <chr> "{TV,\"Cable TV\",Wifi,Kitchen,\"Paid parking off premises\",He…
$ square_feet <dbl> 97, NA, NA, NA, NA, 689, NA, 807, NA, 420, 161, NA, 527, NA, NA…
$ price <chr> "$365.00", "$2,398.00", "$3,096.00", "$797.00", "$857.00", "$75…
$ weekly_price <chr> NA, NA, "$17,513.00", NA, "$2,981.00", "$4,700.00", "$7,453.00"…
$ monthly_price <chr> NA, NA, "$67,073.00", NA, "$8,943.00", NA, "$26,084.00", "$14,1…
$ security_deposit <chr> "$0.00", "$5,000.00", "$3,726.00", NA, NA, "$1,000.00", "$0.00"…
$ cleaning_fee <chr> "$33.00", "$1,100.00", "$522.00", "$300.00", "$75.00", NA, "$0.…
$ guests_included <dbl> 1, 3, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 2, 1, 1, 2, 2, 2, 3, 2, 4, …
$ extra_people <chr> "$66.00", "$350.00", "$0.00", "$0.00", "$0.00", "$100.00", "$33…
$ minimum_nights <dbl> 2, 3, 3, 7, 7, 2, 3, 6, 5, 30, 1, 3, 5, 3, 2, 4, 3, 4, 9, 4, 3,…
$ maximum_nights <dbl> 15, 30, 31, 14, 31, 10, 365, 1125, 21, 90, 30, 730, 14, 15, 112…
$ minimum_minimum_nights <dbl> 2, 3, 3, 3, 7, 2, 3, 6, 5, 30, 1, 3, 5, 3, 2, 4, 3, 4, 9, 4, 3,…
$ maximum_minimum_nights <dbl> 2, 3, 3, 5, 7, 2, 3, 6, 5, 30, 1, 3, 5, 3, 2, 4, 3, 4, 9, 4, 3,…
$ minimum_maximum_nights <dbl> 15, 30, 1125, 14, 1125, 10, 1125, 1125, 1125, 90, 30, 730, 14, …
$ maximum_maximum_nights <dbl> 15, 30, 1125, 14, 1125, 10, 1125, 1125, 1125, 90, 30, 730, 14, …
$ minimum_nights_avg_ntm <dbl> 2.0, 3.0, 3.0, 4.1, 7.0, 2.0, 3.0, 6.0, 5.0, 30.0, 1.0, 3.0, 5.…
$ maximum_nights_avg_ntm <dbl> 15, 30, 1125, 14, 1125, 10, 1125, 1125, 1125, 90, 30, 730, 14, …
$ calendar_updated <chr> "5 months ago", "4 months ago", "7 months ago", "4 months ago",…
$ has_availability <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRU…
$ availability_30 <dbl> 29, 28, 29, 21, 0, 0, 8, 0, 11, 0, 0, 0, 2, 29, 29, 0, 0, 6, 0,…
$ availability_60 <dbl> 59, 58, 59, 21, 0, 0, 8, 0, 24, 0, 0, 0, 13, 59, 59, 0, 0, 36, …
$ availability_90 <dbl> 89, 88, 89, 21, 0, 0, 8, 5, 24, 26, 0, 0, 43, 89, 89, 0, 0, 66,…
$ availability_365 <dbl> 89, 363, 172, 21, 0, 58, 8, 189, 24, 301, 0, 0, 133, 364, 89, 0…
$ calendar_last_scraped <date> 2020-06-28, 2020-06-28, 2020-06-28, 2020-06-28, 2020-06-29, 20…
$ number_of_reviews <dbl> 168, 50, 293, 22, 90, 17, 73, 7, 40, 16, 184, 9, 74, 67, 92, 7,…
$ number_of_reviews_ltm <dbl> 1, 4, 31, 2, 0, 0, 1, 0, 0, 1, 11, 1, 2, 1, 13, 0, 0, 11, 0, 0,…
$ first_review <date> 2009-09-04, 2013-12-02, 2010-10-14, 2010-06-17, 2010-08-16, 20…
$ last_review <date> 2019-07-19, 2019-12-14, 2020-03-02, 2019-08-02, 2017-06-03, 20…
$ review_scores_rating <dbl> 96, 98, 91, 98, 94, 97, 98, 91, 97, 89, 92, 98, 94, 95, 94, 100…
$ review_scores_accuracy <dbl> 10, 10, 10, 10, 10, 10, 10, 10, 10, 9, 10, 10, 9, 9, 9, 10, 9, …
$ review_scores_cleanliness <dbl> 9, 10, 9, 10, 9, 10, 10, 9, 9, 8, 8, 9, 9, 9, 9, 10, 9, 10, 10,…
$ review_scores_checkin <dbl> 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,…
$ review_scores_communication <dbl> 10, 10, 10, 10, 9, 10, 10, 10, 10, 9, 10, 10, 10, 10, 10, 10, 1…
$ review_scores_location <dbl> 9, 10, 10, 10, 10, 10, 10, 9, 9, 10, 10, 9, 10, 10, 9, 10, 9, 1…
$ review_scores_value <dbl> 9, 10, 9, 10, 9, 9, 9, 9, 10, 9, 9, 9, 9, 9, 9, 10, 9, 10, 10, …
$ requires_license <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, …
$ license <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ jurisdiction_names <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ instant_bookable <lgl> FALSE, FALSE, FALSE, FALSE, TRUE, FALSE, TRUE, FALSE, FALSE, FA…
$ is_business_travel_ready <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, …
$ cancellation_policy <chr> "moderate", "moderate", "moderate", "strict_14_with_grace_perio…
$ require_guest_profile_picture <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, …
$ require_guest_phone_verification <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, …
$ calculated_host_listings_count <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ calculated_host_listings_count_entire_homes <dbl> 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, …
$ calculated_host_listings_count_private_rooms <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, …
$ calculated_host_listings_count_shared_rooms <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ reviews_per_month <dbl> 1.28, 0.62, 2.48, 0.18, 0.75, 0.14, 0.61, 0.06, 0.33, 0.14, 1.5…
#calendar <- read_csv('http://data.insideairbnb.com/denmark/hovedstaden/copenhagen/2020-06-26/data/calendar.csv.gz')
#calendar %>% glimpse()
reviews <- read_csv('http://data.insideairbnb.com/denmark/hovedstaden/copenhagen/2020-06-26/data/reviews.csv.gz')
reviews %>% glimpse()
Rows: 386,954
Columns: 6
$ listing_id <dbl> 6983, 6983, 6983, 6983, 6983, 6983, 6983, 6983, 6983, 6983, 6983, 6983, 6983, 6983, 6983, 6983, 6983, …
$ id <dbl> 8985, 9199, 11763, 20616, 23168, 33706, 40860, 51622, 54476, 56504, 1073551, 1133218, 1159109, 1179503…
$ date <date> 2009-09-04, 2009-09-07, 2009-09-28, 2009-12-17, 2010-01-04, 2010-04-05, 2010-05-08, 2010-06-10, 2010-…
$ reviewer_id <dbl> 31133, 11472, 35317, 53151, 50856, 96881, 113545, 135677, 138676, 141069, 366694, 977908, 1960762, 206…
$ reviewer_name <chr> "Jeff", "Andrea", "Jeanine", "Sonja", "James", "Tom", "Orion", "Gonzalo", "Tiffany", "Payton", "Ruifen…
$ comments <chr> "Convenient location, clean apartment, great breakfast...couldn't have asked for more. Best value in l…
# # And the summary plus geodata
# summaries_listings <- read_csv('http://data.insideairbnb.com/denmark/hovedstaden/copenhagen/2020-06-26/visualisations/listings.csv')
# summaries_reviews <- read_csv('http://data.insideairbnb.com/denmark/hovedstaden/copenhagen/2020-06-26/visualisations/reviews.csv')
# summaries_neighbourhoods <- read_csv('http://data.insideairbnb.com/denmark/hovedstaden/copenhagen/2020-06-26/visualisations/neighbourhoods.csv')
# The geodat of the hoods comes as a geojson, so we need the right package to load it
library(geojsonio)
neighbourhoods_geojson <- geojson_read( 'http://data.insideairbnb.com/denmark/hovedstaden/copenhagen/2020-06-26/visualisations/neighbourhoods.geojson', what = "sp")
listings %>%
count(host_id, sort = TRUE)
listings %>%
filter(host_id == 187610263) %>%
count(neighbourhood_cleansed, sort = TRUE)
listings %<>%
mutate(price = price %>% parse_number(),
price_sqf = price / square_feet)
listings %<>%
group_by(host_id) %>%
mutate(host_professional = n() >= 5) %>%
ungroup()
listings %>%
group_by(host_professional) %>%
summarise(review = review_scores_rating %>% mean(na.rm = TRUE),
price = price %>% mean(na.rm = TRUE))
listings %>%
group_by(neighbourhood_cleansed, host_professional) %>%
summarise(review = review_scores_rating %>% mean(na.rm = TRUE)) %>%
pivot_wider(names_from = host_professional, values_from = review)
listings %<>%
mutate(desc_lenght = description %>% str_count('\\w+')) %>%
mutate(desc_long = percent_rank(desc_lenght) > 0.9 )
listings %>%
group_by(desc_long) %>%
summarise(review = review_scores_rating %>% mean(na.rm =TRUE))
listings %>% skim()
── Data Summary ────────────────────────
Values
Name Piped data
Number of rows 28523
Number of columns 110
_______________________
Column type frequency:
character 45
Date 5
logical 18
numeric 42
________________________
Group variables None
── Variable type: character ───────────────────────────────────────────────────────────────────────────────────────────
skim_variable n_missing complete_rate min max empty n_unique whitespace
1 listing_url 0 1 33 37 0 28523 0
2 name 57 0.998 1 211 0 26906 0
3 summary 1096 0.962 1 1000 0 26981 0
4 space 11390 0.601 1 1000 0 16804 0
5 description 515 0.982 1 1000 0 27748 0
6 experiences_offered 0 1 4 4 0 1 0
7 neighborhood_overview 12407 0.565 1 1000 0 15514 0
8 notes 20832 0.270 1 1000 0 7365 0
9 transit 11316 0.603 1 1000 0 16711 0
10 access 15751 0.448 1 1000 0 11249 0
11 interaction 14111 0.505 1 1000 0 13726 0
12 house_rules 13332 0.533 1 1000 0 13703 0
13 picture_url 0 1 81 146 0 28282 0
14 host_url 0 1 37 43 0 25745 0
15 host_name 12 1.00 1 34 0 6415 0
16 host_location 93 0.997 2 151 0 863 0
17 host_about 14028 0.508 1 3550 0 12476 34
18 host_response_time 11 1.00 3 18 0 5 0
19 host_response_rate 11 1.00 2 4 0 44 0
20 host_acceptance_rate 11 1.00 2 4 0 100 0
21 host_thumbnail_url 11 1.00 55 106 0 25669 0
22 host_picture_url 11 1.00 57 109 0 25669 0
23 host_neighbourhood 8007 0.719 1 21 0 56 0
24 host_verifications 0 1 2 156 0 316 0
25 street 0 1 10 61 0 527 0
26 neighbourhood 0 1 5 14 0 21 0
27 neighbourhood_cleansed 0 1 5 25 0 11 0
28 city 13 1.00 1 26 0 134 0
29 state 24042 0.157 1 25 0 170 0
30 zipcode 810 0.972 3 17 0 454 0
31 market 864 0.970 6 21 0 10 0
32 smart_location 0 1 10 35 0 157 0
33 country_code 0 1 2 2 0 1 0
34 country 0 1 7 7 0 1 0
35 property_type 0 1 3 22 0 29 0
36 room_type 0 1 10 15 0 4 0
37 bed_type 0 1 5 13 0 5 0
38 amenities 0 1 2 1179 0 26634 0
39 weekly_price 25008 0.123 7 11 0 718 0
40 monthly_price 26971 0.0544 7 11 0 476 0
41 security_deposit 13845 0.515 5 10 0 386 0
42 cleaning_fee 8968 0.686 5 9 0 430 0
43 extra_people 0 1 5 9 0 265 0
44 calendar_updated 0 1 5 13 0 82 0
45 cancellation_policy 0 1 8 27 0 4 0
── Variable type: Date ────────────────────────────────────────────────────────────────────────────────────────────────
skim_variable n_missing complete_rate min max median n_unique
1 last_scraped 0 1 2020-06-26 2020-06-30 2020-06-27 5
2 host_since 11 1.00 2008-06-27 2020-06-26 2015-06-22 3379
3 calendar_last_scraped 0 1 2020-06-26 2020-06-30 2020-06-27 5
4 first_review 4968 0.826 2009-09-04 2020-06-28 2017-08-07 2582
5 last_review 4968 0.826 2011-08-10 2020-06-28 2019-07-29 1846
── Variable type: logical ─────────────────────────────────────────────────────────────────────────────────────────────
skim_variable n_missing complete_rate mean count
1 thumbnail_url 28523 0 NaN ": "
2 medium_url 28523 0 NaN ": "
3 xl_picture_url 28523 0 NaN ": "
4 host_is_superhost 11 1.00 0.102 "FAL: 25610, TRU: 2902"
5 host_has_profile_pic 11 1.00 0.998 "TRU: 28445, FAL: 67"
6 host_identity_verified 11 1.00 0.376 "FAL: 17790, TRU: 10722"
7 neighbourhood_group_cleansed 28523 0 NaN ": "
8 is_location_exact 0 1 0.789 "TRU: 22507, FAL: 6016"
9 has_availability 0 1 1 "TRU: 28523"
10 requires_license 0 1 0 "FAL: 28523"
11 license 28523 0 NaN ": "
12 jurisdiction_names 28523 0 NaN ": "
13 instant_bookable 0 1 0.286 "FAL: 20360, TRU: 8163"
14 is_business_travel_ready 0 1 0 "FAL: 28523"
15 require_guest_profile_picture 0 1 0.00505 "FAL: 28379, TRU: 144"
16 require_guest_phone_verification 0 1 0.00687 "FAL: 28327, TRU: 196"
17 host_professional 0 1 0.0305 "FAL: 27653, TRU: 870"
18 desc_long 515 0.982 0.0972 "FAL: 25285, TRU: 2723"
── Variable type: numeric ─────────────────────────────────────────────────────────────────────────────────────────────
skim_variable n_missing complete_rate mean sd p0 p25
1 id 0 1 2.04e+ 7 12261143. 6.98e+ 3 1.01e+ 7
2 scrape_id 0 1 2.02e+13 0 2.02e+13 2.02e+13
3 host_id 0 1 6.73e+ 7 76172316. 5.13e+ 2 1.28e+ 7
4 host_listings_count 11 1.00 5.46e+ 0 34.3 0. 1.00e+ 0
5 host_total_listings_count 11 1.00 5.46e+ 0 34.3 0. 1.00e+ 0
6 latitude 0 1 5.57e+ 1 0.0191 5.56e+ 1 5.57e+ 1
7 longitude 0 1 1.26e+ 1 0.0317 1.24e+ 1 1.25e+ 1
8 accommodates 0 1 3.32e+ 0 1.63 1.00e+ 0 2.00e+ 0
9 bathrooms 12 1.00 1.08e+ 0 0.284 0. 1.00e+ 0
10 bedrooms 29 0.999 1.55e+ 0 1.06 0. 1.00e+ 0
11 beds 103 0.996 2.04e+ 0 1.44 0. 1.00e+ 0
12 square_feet 28130 0.0138 7.22e+ 2 576. 0. 1.20e+ 2
13 price 0 1 8.49e+ 2 1067. 0. 4.98e+ 2
14 guests_included 0 1 1.52e+ 0 1.06 1.00e+ 0 1.00e+ 0
15 minimum_nights 0 1 3.85e+ 0 18.1 1.00e+ 0 2.00e+ 0
16 maximum_nights 0 1 6.21e+ 2 553. 1.00e+ 0 1.50e+ 1
17 minimum_minimum_nights 0 1 3.84e+ 0 18.1 1.00e+ 0 2.00e+ 0
18 maximum_minimum_nights 0 1 4.12e+ 0 19.3 1.00e+ 0 2.00e+ 0
19 minimum_maximum_nights 0 1 6.67e+ 2 548. 1.00e+ 0 2.00e+ 1
20 maximum_maximum_nights 0 1 6.70e+ 2 547. 1.00e+ 0 2.00e+ 1
21 minimum_nights_avg_ntm 0 1 3.97e+ 0 18.3 1.00e+ 0 2.00e+ 0
22 maximum_nights_avg_ntm 0 1 6.68e+ 2 547. 1.00e+ 0 2.00e+ 1
23 availability_30 0 1 5.87e+ 0 10.4 0. 0.
24 availability_60 0 1 1.12e+ 1 20.1 0. 0.
25 availability_90 0 1 1.65e+ 1 30.0 0. 0.
26 availability_365 0 1 4.95e+ 1 99.2 0. 0.
27 number_of_reviews 0 1 1.36e+ 1 27.1 0. 1.00e+ 0
28 number_of_reviews_ltm 0 1 2.74e+ 0 6.64 0. 0.
29 review_scores_rating 5447 0.809 9.52e+ 1 6.83 2.00e+ 1 9.30e+ 1
30 review_scores_accuracy 5468 0.808 9.73e+ 0 0.655 2.00e+ 0 1.00e+ 1
31 review_scores_cleanliness 5466 0.808 9.38e+ 0 0.956 2.00e+ 0 9.00e+ 0
32 review_scores_checkin 5488 0.808 9.82e+ 0 0.561 2.00e+ 0 1.00e+ 1
33 review_scores_communication 5470 0.808 9.86e+ 0 0.529 2.00e+ 0 1.00e+ 1
34 review_scores_location 5491 0.807 9.60e+ 0 0.678 2.00e+ 0 9.00e+ 0
35 review_scores_value 5495 0.807 9.45e+ 0 0.763 2.00e+ 0 9.00e+ 0
36 calculated_host_listings_count 0 1 4.45e+ 0 28.1 1.00e+ 0 1.00e+ 0
37 calculated_host_listings_count_entire_homes 0 1 4.17e+ 0 28.1 0. 1.00e+ 0
38 calculated_host_listings_count_private_rooms 0 1 2.58e- 1 0.731 0. 0.
39 calculated_host_listings_count_shared_rooms 0 1 1.22e- 2 0.325 0. 0.
40 reviews_per_month 4968 0.826 4.95e- 1 0.727 1.00e- 2 1.20e- 1
41 price_sqf 28130 0.0138 Inf NaN 3.36e- 1 8.22e- 1
42 desc_lenght 515 0.982 1.26e+ 2 56.3 0. 7.70e+ 1
p50 p75 p100 hist
1 1.95e+ 7 3.04e+ 7 4.40e 7 ▇▇▇▅▆
2 2.02e+13 2.02e+13 2.02e13 ▁▁▇▁▁
3 3.60e+ 7 9.46e+ 7 3.52e 8 ▇▂▁▁▁
4 1.00e+ 0 1.00e+ 0 7.37e 2 ▇▁▁▁▁
5 1.00e+ 0 1.00e+ 0 7.37e 2 ▇▁▁▁▁
6 5.57e+ 1 5.57e+ 1 5.57e 1 ▁▃▇▇▁
7 1.26e+ 1 1.26e+ 1 1.26e 1 ▁▂▇▅▂
8 3.00e+ 0 4.00e+ 0 1.60e 1 ▇▂▁▁▁
9 1.00e+ 0 1.00e+ 0 1.00e 1 ▇▁▁▁▁
10 1.00e+ 0 2.00e+ 0 1.01e 2 ▇▁▁▁▁
11 2.00e+ 0 3.00e+ 0 2.50e 1 ▇▁▁▁▁
12 7.64e+ 2 1.08e+ 3 2.80e 3 ▇▇▃▁▁
13 6.98e+ 2 9.96e+ 2 6.92e 4 ▇▁▁▁▁
14 1.00e+ 0 2.00e+ 0 1.60e 1 ▇▁▁▁▁
15 3.00e+ 0 4.00e+ 0 1.10e 3 ▇▁▁▁▁
16 1.12e+ 3 1.12e+ 3 1.00e 4 ▇▁▁▁▁
17 3.00e+ 0 4.00e+ 0 1.10e 3 ▇▁▁▁▁
18 3.00e+ 0 4.00e+ 0 1.10e 3 ▇▁▁▁▁
19 1.12e+ 3 1.12e+ 3 1.00e 4 ▇▁▁▁▁
20 1.12e+ 3 1.12e+ 3 1.00e 4 ▇▁▁▁▁
21 3.00e+ 0 4.00e+ 0 1.10e 3 ▇▁▁▁▁
22 1.12e+ 3 1.12e+ 3 1.00e 4 ▇▁▁▁▁
23 0. 8.00e+ 0 3.00e 1 ▇▁▁▁▂
24 0. 1.40e+ 1 6.00e 1 ▇▁▁▁▂
25 0. 1.80e+ 1 9.00e 1 ▇▁▁▁▁
26 0. 3.50e+ 1 3.65e 2 ▇▁▁▁▁
27 5.00e+ 0 1.50e+ 1 6.37e 2 ▇▁▁▁▁
28 0. 3.00e+ 0 3.76e 2 ▇▁▁▁▁
29 9.70e+ 1 1.00e+ 2 1.00e 2 ▁▁▁▁▇
30 1.00e+ 1 1.00e+ 1 1.00e 1 ▁▁▁▁▇
31 1.00e+ 1 1.00e+ 1 1.00e 1 ▁▁▁▁▇
32 1.00e+ 1 1.00e+ 1 1.00e 1 ▁▁▁▁▇
33 1.00e+ 1 1.00e+ 1 1.00e 1 ▁▁▁▁▇
34 1.00e+ 1 1.00e+ 1 1.00e 1 ▁▁▁▁▇
35 1.00e+ 1 1.00e+ 1 1.00e 1 ▁▁▁▁▇
36 1.00e+ 0 1.00e+ 0 2.81e 2 ▇▁▁▁▁
37 1.00e+ 0 1.00e+ 0 2.81e 2 ▇▁▁▁▁
38 0. 0. 1.20e 1 ▇▁▁▁▁
39 0. 0. 1.30e 1 ▇▁▁▁▁
40 2.80e- 1 5.90e- 1 3.06e 1 ▇▁▁▁▁
41 1.12e+ 0 6.59e+ 0 Inf ▇▁▁▁▁
42 1.51e+ 2 1.75e+ 2 2.10e 2 ▂▃▃▅▇
listings %<>%
mutate(across(is_character, ~ifelse(.x == "", NA, .x)))
library(VIM)
listings %>%
select(host_is_superhost, review_scores_rating, host_response_time, name, host_since,zipcode) %>%
aggr(numbers = TRUE, prop = c(TRUE, FALSE))
listings %<>%
mutate(party_place = accommodates >= 10)
listings %>%
filter(party_place == TRUE) %>%
group_by(neighbourhood_cleansed) %>%
summarize(n = n(),
review = review_scores_rating %>% mean(na.rm = TRUE),
price = price %>% mean(na.rm = TRUE) ) %>%
arrange(desc(n))
mad_map <- get_map(getbb("Madrid"), maptype = "toner-background")
Error in getbb("Madrid") : could not find function "getbb"
listings %>% leaflet() %>%
addTiles() %>%
addMarkers(~longitude, ~latitude,
labelOptions = labelOptions(noHide = F),
clusterOptions = markerClusterOptions(),
popup = paste0("<b> Name: </b>", listings$name,
"<br/><b> Host Name: </b>", listings$host_name,
"<br> <b> Price: </b>", listings$price,
"<br/><b> Room Type: </b>", listings$room_type,
"<br/><b> Property Type: </b>", listings$property_type
)) %>%
# setView(-74.00, 40.71, zoom = 12) %>%
addProviderTiles("CartoDB.Positron")
# I need to fortify the data AND keep trace of the commune code! (Takes ~2 minutes)
library(broom)
neighbourhoods_tidy <- neighbourhoods_geojson %>%
tidy(region = "neighbourhood")
neighbourhoods_tidy %>% glimpse()
Rows: 6,658
Columns: 7
$ long <dbl> 12.63094, 12.63126, 12.63221, 12.63160, 12.63154, 12.63153, 12.63153, 12.63153, 12.63157, 12.63158, 12…
$ lat <dbl> 55.67050, 55.67028, 55.66961, 55.66943, 55.66941, 55.66940, 55.66939, 55.66930, 55.66926, 55.66924, 55…
$ order <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,…
$ hole <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALS…
$ piece <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ group <fct> Amager st.1, Amager st.1, Amager st.1, Amager st.1, Amager st.1, Amager st.1, Amager st.1, Amager st.1…
$ id <chr> "Amager st", "Amager st", "Amager st", "Amager st", "Amager st", "Amager st", "Amager st", "Amager st"…
neighbourhoods_tidy %>%
ggplot(aes(x = long, y = lat, group = group)) +
geom_polygon() +
theme_void() +
coord_map()
neighborhood_agg <- listings %>%
group_by(neighbourhood_cleansed) %>%
summarise(n = n(),
price_mean = price %>% mean(na.rm = TRUE),
review_mean = review_scores_rating %>% mean(na.rm = TRUE))
neighbourhoods_tidy %<>%
left_join(neighborhood_agg, by = c('id' = 'neighbourhood_cleansed'))
neighbourhoods_tidy %>%
ggplot(aes(x = long, y = lat, group = group, fill = n)) +
geom_polygon() +
theme_void() +
coord_map()
neighbourhoods_tidy %>%
ggplot(aes(x = long, y = lat, group = group, fill = price_mean)) +
geom_polygon() +
theme_void() +
coord_map()
neighbourhoods_tidy %>%
ggplot(aes(x = long, y = lat, group = group, fill = review_mean)) +
geom_polygon() +
theme_void() +
coord_map()
Inspect again…
listings %>% skim(neighbourhood_cleansed, accommodates, price, room_type, host_is_superhost )
── Data Summary ────────────────────────
Values
Name Piped data
Number of rows 28523
Number of columns 111
_______________________
Column type frequency:
character 2
logical 1
numeric 2
________________________
Group variables None
── Variable type: character ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
skim_variable n_missing complete_rate min max empty n_unique whitespace
1 neighbourhood_cleansed 0 1 5 25 0 11 0
2 room_type 0 1 10 15 0 4 0
── Variable type: logical ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
skim_variable n_missing complete_rate mean count
1 host_is_superhost 11 1.00 0.102 FAL: 25610, TRU: 2902
── Variable type: numeric ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
1 accommodates 0 1 3.32 1.63 1 2 3 4 16 ▇▂▁▁▁
2 price 0 1 849. 1067. 0 498 698 996 69175 ▇▁▁▁▁
data <- listings %>%
select(review_scores_rating, neighbourhood_cleansed, accommodates, price, room_type, host_is_superhost) %>%
rename(y = price) %>%
relocate(y, everything()) %>%
drop_na()
data %>% count(neighbourhood_cleansed, sort = TRUE)
data %>% count(room_type, sort = TRUE)
data %<>%
filter(!(room_type %in% c('hared room', 'Hotel room')))
data %<>%
filter(percent_rank(y) <0.99)
data %>%
skim()
── Data Summary ────────────────────────
Values
Name Piped data
Number of rows 22807
Number of columns 6
_______________________
Column type frequency:
character 2
logical 1
numeric 3
________________________
Group variables None
── Variable type: character ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
skim_variable n_missing complete_rate min max empty n_unique whitespace
1 neighbourhood_cleansed 0 1 5 25 0 11 0
2 room_type 0 1 11 15 0 3 0
── Variable type: logical ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
skim_variable n_missing complete_rate mean count
1 host_is_superhost 0 1 0.121 FAL: 20048, TRU: 2759
── Variable type: numeric ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
1 y 0 1 771. 391. 0 498 698 950 2890 ▅▇▂▁▁
2 review_scores_rating 0 1 95.2 6.79 20 93 97 100 100 ▁▁▁▁▇
3 accommodates 0 1 3.25 1.55 1 2 3 4 16 ▇▂▁▁▁